import numpy
import numpy.random

# log2n = int(numpy.log2(n))
# for r0s in range(log2n + 1):
#   r0 = 1 << r0s
#   for r1s in range(r0s - 1, -1, -1):
#   #while (r1list[-1] >> 1) > 0:
#     #r1list.append(r1list[-1] >> 1)
#     r1 = 1 << r1s
#     for i in range(n):
#       j = i ^ r1
#       if i < j:
#         nmin = min(a[i], a[j])
#         nmax = max(a[i], a[j])
#         if i // r0 & 1:
#           a[i] = nmax
#           a[j] = nmin
#         else:
#           a[i] = nmin
#           a[j] = nmax
# print(a)
def gen_bitonic_round(r0s, r1s, n):
  """Emit scalar compare-exchange statements for one bitonic round.

  Args:
    r0s: log2 of the block size; bit ``r0s`` of the lower index selects
      whether the pair is stored largest-first or smallest-first.
    r1s: log2 of the distance between the two elements of each pair.
    n: number of elements of ``a`` covered by the round.

  Returns:
    Newline-joined statements that operate on an array named ``a`` using
    the temporaries ``nmax`` and ``nmin``.
  """
  block = 1 << r0s
  stride = 1 << r1s
  lines = []
  for lo in range(n):
    hi = lo ^ stride
    # Each pair is visited twice; only handle it from its lower index.
    if not lo < hi:
      continue
    lines.extend([
      "nmax = max(a[%d], a[%d]);" % (lo, hi),
      "nmin = min(a[%d], a[%d]);" % (lo, hi),
    ])
    if (lo // block) & 1:
      # Descending block: larger value goes to the lower index.
      lines.extend(["a[%d] = nmax;" % lo, "a[%d] = nmin;" % hi])
    else:
      # Ascending block: smaller value goes to the lower index.
      lines.extend(["a[%d] = nmin;" % lo, "a[%d] = nmax;" % hi])
  return "\n".join(lines)
# print(a)
# print(gen_bitonic(1, 0, 8))
# print(gen_bitonic(2, 1, 8))
# print(gen_bitonic(2, 0, 8))
# print(gen_bitonic(3, 2, 8))
# print(gen_bitonic(3, 1, 8))
# print(gen_bitonic(3, 0, 8))

# print(gen_bitonic(1, 0, 16))
# print(gen_bitonic(1, 0, 16))
# print(gen_bitonic(2, 1, 16))
# print(gen_bitonic(2, 0, 16))
# print(gen_bitonic(3, 2, 16))
# print(gen_bitonic(3, 1, 16))
# print(gen_bitonic(3, 0, 16))
# print(gen_bitonic(4, 3, 16))
# print(gen_bitonic(4, 2, 16))
# print(gen_bitonic(4, 1, 16))
# print(gen_bitonic(4, 0, 16))
# print(a)
def bitonic_reg_di(v):
  """Order lanes 0-1 descending and lanes 2-3 ascending.

  Reads the first four elements of ``v`` and returns a new 4-element
  numpy array ``[max01, min01, min23, max23]``.
  """
  a, b = v[0], v[1]
  c, d = v[2], v[3]
  lanes = [
    max(a, b),
    min(a, b),
    min(c, d),
    max(c, d),
  ]
  return numpy.asarray(lanes)
def bitonic_reg_id(v):
  """Order lanes 0-1 ascending and lanes 2-3 descending.

  Reads the first four elements of ``v`` and returns a new 4-element
  numpy array ``[min01, max01, max23, min23]``.
  """
  a, b = v[0], v[1]
  c, d = v[2], v[3]
  lanes = [
    min(a, b),
    max(a, b),
    max(c, d),
    min(c, d),
  ]
  return numpy.asarray(lanes)
def bitonic_reg_ii(v):
  """Order both lane pairs (0-1 and 2-3) ascending.

  Reads the first four elements of ``v`` and returns a new 4-element
  numpy array ``[min01, max01, min23, max23]``.
  """
  a, b = v[0], v[1]
  c, d = v[2], v[3]
  lanes = [
    min(a, b),
    max(a, b),
    min(c, d),
    max(c, d),
  ]
  return numpy.asarray(lanes)
def bitonic_reg_dd(v):
  """Order both lane pairs (0-1 and 2-3) descending.

  Reads the first four elements of ``v`` and returns a new 4-element
  numpy array ``[max01, min01, max23, min23]``.
  """
  a, b = v[0], v[1]
  c, d = v[2], v[3]
  lanes = [
    max(a, b),
    min(a, b),
    max(c, d),
    min(c, d),
  ]
  return numpy.asarray(lanes)
def bitonic_reg_d(v):
  """Compare-exchange lanes (0, 2) and (1, 3), larger values first.

  Reads the first four elements of ``v`` and returns a new 4-element
  numpy array ``[max02, max13, min02, min13]``.
  """
  a, b = v[0], v[1]
  c, d = v[2], v[3]
  lanes = [
    max(a, c),
    max(b, d),
    min(a, c),
    min(b, d),
  ]
  return numpy.asarray(lanes)
def bitonic_reg_i(v):
  """Compare-exchange lanes (0, 2) and (1, 3), smaller values first.

  Reads the first four elements of ``v`` and returns a new 4-element
  numpy array ``[min02, min13, max02, max13]``.
  """
  a, b = v[0], v[1]
  c, d = v[2], v[3]
  lanes = [
    min(a, c),
    min(b, d),
    max(a, c),
    max(b, d),
  ]
  return numpy.asarray(lanes)
def bitonic_2reg_i(v0, v1):
  """Lane-wise compare-exchange of two vectors, smaller values first.

  Returns a ``(minimum, maximum)`` tuple of the element-wise results.
  """
  lower = numpy.minimum(v0, v1)
  upper = numpy.maximum(v0, v1)
  return lower, upper
def bitonic_2reg_d(v0, v1):
  """Lane-wise compare-exchange of two vectors, larger values first.

  Returns a ``(maximum, minimum)`` tuple of the element-wise results.
  """
  upper = numpy.maximum(v0, v1)
  lower = numpy.minimum(v0, v1)
  return upper, lower


# print(gen_bitonic(1, 0, 8))
# print(gen_bitonic(2, 1, 8))
# print(gen_bitonic(2, 0, 8))
# print(gen_bitonic(3, 2, 8))
# print(gen_bitonic(3, 1, 8))
# print(gen_bitonic(3, 0, 8))
# print(gen_bitonic(1, 0, 16))
def gen_bitonic(n):
  """Generate a vectorised bitonic sorting program for ``n`` elements.

  The generated statements operate on an array named ``a`` in 4-element
  slices. Rounds whose partner distance is 1 or 2 stay inside one slice and
  call the ``bitonic_reg_*`` helpers; larger distances pair up two whole
  slices and call ``bitonic_2reg_i`` / ``bitonic_2reg_d``. The final merge
  is emitted in the ascending direction.

  A summary line with the number of one- and two-slice operations is
  printed as a side effect.

  Args:
    n: number of elements to sort. Must be a power of two and at least 4;
      ``n == 1`` is also accepted and yields an empty program.

  Returns:
    The program as newline-joined Python statements.

  Raises:
    ValueError: if ``n`` is not a supported size. Other sizes would
      otherwise produce statements that slice past the array or do not
      form a complete sorting network.
  """
  if n < 1 or n & (n - 1) or n == 2:
    raise ValueError("n must be a power of two >= 4 (or 1), got %r" % (n,))
  nregop1 = 0
  nregop2 = 0
  def gen_bitonic_round_vec(r0s, r1s, n):
    # One round: r0s is log2 of the block size that fixes the direction,
    # r1s is log2 of the distance between compared elements.
    nonlocal nregop1
    nonlocal nregop2
    r0 = 1 << r0s
    r1 = 1 << r1s
    ret = []
    for i in range(0, n, 4):
      descending = i // r0 & 1
      if r1s >= 2:
        # Partners live in another slice; handle each slice pair once,
        # from its lower index.
        j = i ^ r1
        if i < j:
          op = "bitonic_2reg_d" if descending else "bitonic_2reg_i"
          ret.append("a[%d:%d+4], a[%d:%d+4] = %s(a[%d:%d+4], a[%d:%d+4])" % (i, i, j, j, op, i, i, j, j))
          nregop2 += 1
        continue
      if r1s == 1:
        op = "bitonic_reg_d" if descending else "bitonic_reg_i"
      elif r0s == 1:
        # Blocks of two: both directions occur inside every slice.
        op = "bitonic_reg_di"
      else:
        op = "bitonic_reg_dd" if descending else "bitonic_reg_ii"
      ret.append("a[%d:%d+4] = %s(a[%d:%d+4]);" % (i, i, op, i, i))
      nregop1 += 1
    return "\n".join(ret)
  # Exact for powers of two, unlike truncating a floating-point log2.
  log2n = int(n).bit_length() - 1
  code = []
  for i in range(1, log2n + 1):
    for j in range(i - 1, -1, -1):
      code.append(gen_bitonic_round_vec(i, j, n))
  print("sorting %d nums, 1 reg ops: %d, 2 reg ops: %d" % (n, nregop1, nregop2))
  return "\n".join(code)
  # code = "\n".join([
  #   gen_bitonic_round_vec(1, 0, 16),
  #   gen_bitonic_round_vec(2, 1, 16),
  #   gen_bitonic_round_vec(2, 0, 16),
  #   gen_bitonic_round_vec(3, 2, 16),
  #   gen_bitonic_round_vec(3, 1, 16),
  #   gen_bitonic_round_vec(3, 0, 16),
  #   gen_bitonic_round_vec(4, 3, 16),
  #   gen_bitonic_round_vec(4, 2, 16),
  #   gen_bitonic_round_vec(4, 1, 16),
  #   gen_bitonic_round_vec(4, 0, 16),
  # ])
# print(gen_bitonic_vec(1, 0, 16))
# print(gen_bitonic_vec(2, 1, 16))
# print(gen_bitonic_vec(2, 0, 16))
# print(gen_bitonic_vec(3, 2, 16))
# print(gen_bitonic_vec(3, 1, 16))
# print(gen_bitonic_vec(3, 0, 16))
# print(gen_bitonic_vec(4, 3, 16))
# print(gen_bitonic_vec(4, 2, 16))
# print(gen_bitonic_vec(4, 1, 16))
# print(gen_bitonic_vec(4, 0, 16))
# Number of elements used by the self-test further down: it is passed to
# gen_bitonic() and is the length of each random test array.
n = 64
# a = numpy.random.randint(0, 100, n)
# log2n = int(numpy.log2(n))
# ncmp = 0
# for r0s in range(log2n + 1):
#   r0 = 1 << r0s
#   for r1s in range(r0s - 1, -1, -1):
#     r1 = 1 << r1s
#     for iout in range(0, n, r0):
#       if ((iout >> r0s) & 1) == 0:
#         for iin in range(iout, iout + r0, r1 + r1):
#           for i in range(iin, iin + r1):
#             j = i + r1
#             nmin = min(a[i], a[j])
#             nmax = max(a[i], a[j])
#             a[i] = nmin
#             a[j] = nmax
#             ncmp += 1
#       else:
#         for iin in range(iout, iout + r0, r1 + r1):
#           for i in range(iin, iin + r1):
#             j = i + r1
#             nmin = min(a[i], a[j])
#             nmax = max(a[i], a[j])
#             a[j] = nmin
#             a[i] = nmax
#             ncmp += 1
# print(a, ncmp)
    # for i in range(n):
    #   j = i ^ r1
    #   if i < j:
    #     nmin = min(a[i], a[j])
    #     nmax = max(a[i], a[j])
    #     if i // r0 & 1:
    #       a[i] = nmax
    #       a[j] = nmin
    #     else:
    #       a[i] = nmin
    #       a[j] = nmax
# Self-test: generate the vectorised sorting program once, then run it on
# 1000 random integer arrays and report the first result that is not sorted.
code = gen_bitonic(n)
for i in range(1000):
  a = numpy.random.randint(0, 100, n)
  acpy = a.copy()  # keep the unsorted input for the failure report
  # The generated statements refer to the array as `a` and call the
  # bitonic_* helpers by name, so they are executed in this module's scope.
  exec(code)
  # Any element smaller than its predecessor means `a` is not ascending.
  if (a[1:] < a[:-1]).any():
    print(acpy)
    print(a)
    break
